library(tidyverse)
library(insuranceData)
library(plotly)
# Load the dataCar data set shipped with the insuranceData package and
# print a compact overview of its columns (type and first few values).
data("dataCar", package = "insuranceData")
utils::str(dataCar)
## 'data.frame': 67856 obs. of 11 variables:
## $ veh_value: num 1.06 1.03 3.26 4.14 0.72 2.01 1.6 1.47 0.52 0.38 ...
## $ exposure : num 0.304 0.649 0.569 0.318 0.649 ...
## $ clm : int 0 0 0 0 0 0 0 0 0 0 ...
## $ numclaims: int 0 0 0 0 0 0 0 0 0 0 ...
## $ claimcst0: num 0 0 0 0 0 0 0 0 0 0 ...
## $ veh_body : Factor w/ 13 levels "BUS","CONVT",..: 4 4 13 11 4 5 8 4 4 4 ...
## $ veh_age : int 3 2 2 2 4 3 3 2 4 4 ...
## $ gender : Factor w/ 2 levels "F","M": 1 1 1 1 1 2 2 2 1 1 ...
## $ area : Factor w/ 6 levels "A","B","C","D",..: 3 1 5 4 3 3 1 2 1 2 ...
## $ agecat : int 2 4 2 2 2 4 4 6 3 4 ...
## $ X_OBSTAT_: Factor w/ 1 level "01101 0 0 0": 1 1 1 1 1 1 1 1 1 1 ...
# Preview the first rows of the data set.
dataCar %>% head()

# Bar chart: number of observations for each claim count.
ggplot(data = dataCar, mapping = aes(x = numclaims)) +
  geom_bar(fill = "navyblue")

# Bar chart: number of observations in each age category.
ggplot(data = dataCar, mapping = aes(x = agecat)) +
  geom_bar(fill = "navyblue")

# Bar chart: number of observations per gender.
ggplot(data = dataCar, mapping = aes(x = gender)) +
  geom_bar(fill = "navyblue")
# Histogram of vehicle value with every value above 7 capped at 7, so that
# all larger values are collected at the right edge of the plot.
dataCar %>%
  # pmin() caps element-wise; row order is irrelevant for a histogram, so
  # no sorting is needed before plotting.
  mutate(veh_value = pmin(veh_value, 7)) %>%
  ggplot(mapping = aes(x = veh_value)) +
  # bins = 30 is ggplot2's default; stating it explicitly avoids the
  # "`stat_bin()` using `bins = 30`" message.
  geom_histogram(bins = 30, fill = "navyblue")
### c)
# Average number of claims per age category, shown as one point per category.
dataCar %>%
  group_by(agecat) %>%
  summarize(avg_claims = mean(numclaims)) %>%
  ggplot(mapping = aes(x = agecat, y = avg_claims)) +
  # `fill` is ignored by the default (solid) point shape, so the points
  # would stay black; `colour` is the aesthetic that actually tints them.
  geom_point(colour = "navy")
The older the owner, the smaller the average number of claims. That
makes sense, since younger drivers may drive more recklessly than older
people.
# Five-number summary (plus mean) of the vehicle value column.
summary(dataCar[["veh_value"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.010 1.500 1.777 2.150 34.560
# Average number of claims per vehicle-value quartile.
# ntile(veh_value, n = 4) assigns each observation to one of four roughly
# equal-sized groups ordered by veh_value (1 = lowest, 4 = highest).
# The object name `plot` is kept because later code refers to it.
plot <- dataCar %>%
  mutate(veh_value_bin = ntile(veh_value, n = 4)) %>%
  group_by(veh_value_bin) %>%
  summarize(avg_claims = mean(numclaims)) %>%
  ggplot(mapping = aes(x = veh_value_bin, y = avg_claims)) +
  # `fill` is ignored by the default (solid) point shape; `colour` is the
  # aesthetic that actually tints the points.
  geom_point(colour = "navy")
plot
The higher the value of the car, the higher the average number of claims.
# Convert the static ggplot object into an interactive plotly figure.
ggplotly(plot)